knitr::opts_chunk$set(echo = TRUE)
library(fitdistrplus)
## Loading required package: MASS
## Loading required package: survival
library(logspline)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:MASS':
## 
##     select
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DESeq2)
## Loading required package: S4Vectors
## Loading required package: stats4
## Loading required package: BiocGenerics
## 
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:dplyr':
## 
##     combine, intersect, setdiff, union
## The following objects are masked from 'package:stats':
## 
##     IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
## 
##     anyDuplicated, aperm, append, as.data.frame, basename, cbind,
##     colnames, dirname, do.call, duplicated, eval, evalq, Filter, Find,
##     get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply,
##     match, mget, order, paste, pmax, pmax.int, pmin, pmin.int,
##     Position, rank, rbind, Reduce, rownames, sapply, setdiff, sort,
##     table, tapply, union, unique, unsplit, which.max, which.min
## 
## Attaching package: 'S4Vectors'
## The following objects are masked from 'package:dplyr':
## 
##     first, rename
## The following object is masked from 'package:utils':
## 
##     findMatches
## The following objects are masked from 'package:base':
## 
##     expand.grid, I, unname
## Loading required package: IRanges
## 
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
## 
##     collapse, desc, slice
## Loading required package: GenomicRanges
## Loading required package: GenomeInfoDb
## Loading required package: SummarizedExperiment
## Loading required package: MatrixGenerics
## Loading required package: matrixStats
## 
## Attaching package: 'matrixStats'
## The following object is masked from 'package:dplyr':
## 
##     count
## 
## Attaching package: 'MatrixGenerics'
## The following objects are masked from 'package:matrixStats':
## 
##     colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse,
##     colCounts, colCummaxs, colCummins, colCumprods, colCumsums,
##     colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs,
##     colMads, colMaxs, colMeans2, colMedians, colMins, colOrderStats,
##     colProds, colQuantiles, colRanges, colRanks, colSdDiffs, colSds,
##     colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads,
##     colWeightedMeans, colWeightedMedians, colWeightedSds,
##     colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
##     rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods,
##     rowCumsums, rowDiffs, rowIQRDiffs, rowIQRs, rowLogSumExps,
##     rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins,
##     rowOrderStats, rowProds, rowQuantiles, rowRanges, rowRanks,
##     rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs, rowVars,
##     rowWeightedMads, rowWeightedMeans, rowWeightedMedians,
##     rowWeightedSds, rowWeightedVars
## Loading required package: Biobase
## Welcome to Bioconductor
## 
##     Vignettes contain introductory material; view with
##     'browseVignettes()'. To cite Bioconductor, see
##     'citation("Biobase")', and for packages 'citation("pkgname")'.
## 
## Attaching package: 'Biobase'
## The following object is masked from 'package:MatrixGenerics':
## 
##     rowMedians
## The following objects are masked from 'package:matrixStats':
## 
##     anyMissing, rowMedians
library(magrittr)
## 
## Attaching package: 'magrittr'
## The following object is masked from 'package:GenomicRanges':
## 
##     subtract
library(pheatmap)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ✔ readr     2.1.5
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::%within%() masks IRanges::%within%()
## ✖ IRanges::collapse()   masks dplyr::collapse()
## ✖ Biobase::combine()    masks BiocGenerics::combine(), dplyr::combine()
## ✖ matrixStats::count()  masks dplyr::count()
## ✖ IRanges::desc()       masks dplyr::desc()
## ✖ tidyr::expand()       masks S4Vectors::expand()
## ✖ tidyr::extract()      masks magrittr::extract()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ S4Vectors::first()    masks dplyr::first()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ ggplot2::Position()   masks BiocGenerics::Position(), base::Position()
## ✖ purrr::reduce()       masks GenomicRanges::reduce(), IRanges::reduce()
## ✖ S4Vectors::rename()   masks dplyr::rename()
## ✖ lubridate::second()   masks S4Vectors::second()
## ✖ lubridate::second<-() masks S4Vectors::second<-()
## ✖ dplyr::select()       masks MASS::select()
## ✖ purrr::set_names()    masks magrittr::set_names()
## ✖ IRanges::slice()      masks dplyr::slice()
## ✖ magrittr::subtract()  masks GenomicRanges::subtract()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mnormt)
library(nlcor)
library(RColorBrewer)
gencode.vM33.All.GeneLengths <- read.table(file="gencode.vM33.All.GeneLengths.txt",sep="\t",col.names = c("GeneID","GeneLength"))

# Convert to Numeric
gencode.vM33.All.GeneLengths <- gencode.vM33.All.GeneLengths %>%
  mutate(across(-1, as.numeric))

row.names(gencode.vM33.All.GeneLengths) <- sub("\\..*", "", gencode.vM33.All.GeneLengths$GeneID)
hist(gencode.vM33.All.GeneLengths$GeneLength)

gencode.vM33.All.GeneLengths$log2.GeneLength <- log2(gencode.vM33.All.GeneLengths$GeneLength) + 1

par(mar = c(5, 5, 2, 2))
hist(gencode.vM33.All.GeneLengths$log2.GeneLength, freq = FALSE)
lines(density(gencode.vM33.All.GeneLengths$log2.GeneLength))

par(mar = c(5, 5, 2, 2))
descdist(gencode.vM33.All.GeneLengths$log2.GeneLength, discrete = FALSE)

## summary statistics
## ------
## min:  4.321928   max:  22.4976 
## median:  12.66133 
## mean:  12.66346 
## estimated sd:  3.081378 
## estimated skewness:  0.06684799 
## estimated kurtosis:  2.206873
fit.norm <- fitdist(gencode.vM33.All.GeneLengths$log2.GeneLength, "norm")
par(mar = c(5, 5, 2, 2))
plot(fit.norm)

fit.weibull <- fitdist(gencode.vM33.All.GeneLengths$log2.GeneLength, "weibull")
par(mar = c(5, 5, 2, 2))
plot(fit.weibull)

fit.lognorm <- fitdist(gencode.vM33.All.GeneLengths$log2.GeneLength, "lnorm")
par(mar = c(5, 5, 2, 2))
plot(fit.lognorm)

fit.norm$aic
## [1] 289754.7
fit.weibull$aic
## [1] 289708.8
fit.lognorm$aic
## [1] 291502.9

Choose top and bottom 5% genes and median genes in our DESeq datasets for sampling.

PDDo <- "/Users/mar/Library/CloudStorage/GoogleDrive-mohitharikatla.education@gmail.com/My Drive/RR9_MurineSequencingData/Thesis/02_rr9/00.1_Overlap_DESeq/PostDESeq_overlap.RData"
load(PDDo)

Gene Length Comparison (Overlap)

gencode.vM33.All.GeneLengths.filtered <- gencode.vM33.All.GeneLengths[Overlap_names,]
median_value <- median(gencode.vM33.All.GeneLengths.filtered$log2.GeneLength)

# Calculate the absolute differences between each data point and the median
gencode.vM33.All.GeneLengths.filtered$ab_diff <- abs(gencode.vM33.All.GeneLengths.filtered$log2.GeneLength - median_value)

median_500_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:500],]

top_500_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:500],]

bottom_500_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:500],]
sample_genes <- list(
  median500 = median_500_genes,
  top500 = top_500_genes,
  bottom500 = bottom_500_genes
)

ONT_median.ds <- ONT_DESeq_overlap.ds[row.names(median_500_genes), ]
Illumina_median.ds <- Illumina_DESeq_overlap.ds[row.names(median_500_genes), ]

ONT_top.ds <- ONT_DESeq_overlap.ds[row.names(top_500_genes), ]
Illumina_top.ds <- Illumina_DESeq_overlap.ds[row.names(top_500_genes), ]

ONT_bottom.ds <- ONT_DESeq_overlap.ds[row.names(bottom_500_genes), ]
Illumina_bottom.ds <- Illumina_DESeq_overlap.ds[row.names(bottom_500_genes), ]
ONT_median.rlog <- rlog(ONT_median.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
Illumina_median.rlog <- rlog(Illumina_median.ds,blind=TRUE)

plotPCA(ONT_median.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

plotPCA(Illumina_median.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_top.rlog <- rlog(ONT_top.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
Illumina_top.rlog <- rlog(Illumina_top.ds,blind=TRUE)

plotPCA(ONT_top.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

plotPCA(Illumina_top.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_bottom.rlog <- rlog(ONT_bottom.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
Illumina_bottom.rlog <- rlog(Illumina_bottom.ds,blind=TRUE)

plotPCA(ONT_bottom.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

plotPCA(Illumina_bottom.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

design(ONT_bottom.ds) <- ~ condition
ONT_bottom.ds$condition %<>% relevel(ref="GC")
ONT_bottom.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_bottom.ds %<>% nbinomWaldTest()

ONT_bottom.results <- results(ONT_bottom.ds,independentFiltering = TRUE,alpha = 0.05)

design(Illumina_bottom.ds) <- ~ condition
Illumina_bottom.ds$condition %<>% relevel(ref="GC")
Illumina_bottom.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
Illumina_bottom.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
Illumina_bottom.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
Illumina_bottom.ds %<>% nbinomWaldTest()

Illumina_bottom.results <- results(Illumina_bottom.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_top.ds) <- ~ condition
ONT_top.ds$condition %<>% relevel(ref="GC")
ONT_top.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 8 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_top.ds %<>% nbinomWaldTest()

ONT_top.results <- results(ONT_top.ds,independentFiltering = TRUE,alpha = 0.05)

design(Illumina_top.ds) <- ~ condition
Illumina_top.ds$condition %<>% relevel(ref="GC")
Illumina_top.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 3 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
Illumina_top.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
Illumina_top.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
Illumina_top.ds %<>% nbinomWaldTest()

Illumina_top.results <- results(Illumina_top.ds,independentFiltering = TRUE,alpha = 0.05)
design(ONT_median.ds) <- ~ condition
ONT_median.ds$condition %<>% relevel(ref="GC")
ONT_median.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_median.ds %<>% nbinomWaldTest()

ONT_median.results <- results(ONT_median.ds,independentFiltering = TRUE,alpha = 0.05)

design(Illumina_median.ds) <- ~ condition
Illumina_median.ds$condition %<>% relevel(ref="GC")
Illumina_median.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
Illumina_median.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
Illumina_median.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
Illumina_median.ds %<>% nbinomWaldTest()

Illumina_median.results <- results(Illumina_median.ds,independentFiltering = TRUE,alpha = 0.05)
ONT_median.results <- ONT_median.results %>% `[`(order(.$padj),)
head(ONT_median.results)
## log2 fold change (MLE): condition F vs GC 
## Wald test p-value: condition F vs GC 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000029343  6.729932      -2.502332  0.709312  -3.52783 0.000418983
## ENSMUSG00000018752 11.886726      -0.434294  0.136538  -3.18075 0.001468967
## ENSMUSG00000039382  4.039128       0.745970  0.257303   2.89919 0.003741252
## ENSMUSG00000118663  1.238149       1.113364  0.427758   2.60279 0.009246902
## ENSMUSG00000030722  0.528138       1.569297  0.580396   2.70384 0.006854349
## ENSMUSG00000079048  2.664272      -0.798253  0.299546  -2.66488 0.007701679
##                         padj
##                    <numeric>
## ENSMUSG00000029343  0.209492
## ENSMUSG00000018752  0.367242
## ENSMUSG00000039382  0.623542
## ENSMUSG00000118663  0.660493
## ENSMUSG00000030722  0.660493
## ENSMUSG00000079048  0.660493
par(mfrow=c(1,2))
plotCounts(ONT_median.ds, gene="ENSMUSG00000029343", normalized = TRUE, xlab="")
plotCounts(ONT_median.ds, gene = which.max(ONT_median.results$padj), xlab="",
           main = "Gene with max. p.adj.\n(=least significant)")

Illumina_median.results <- Illumina_median.results %>% `[`(order(.$padj),)
head(Illumina_median.results)
## log2 fold change (MLE): condition F vs GC 
## Wald test p-value: condition F vs GC 
## DataFrame with 6 rows and 6 columns
##                     baseMean log2FoldChange     lfcSE      stat      pvalue
##                    <numeric>      <numeric> <numeric> <numeric>   <numeric>
## ENSMUSG00000018752   269.844      -0.589231 0.0944732  -6.23702 4.45987e-10
## ENSMUSG00000026034  1143.537       0.228240 0.0440113   5.18593 2.14936e-07
## ENSMUSG00000038777   171.765      -0.474810 0.0981574  -4.83723 1.31662e-06
## ENSMUSG00000099696   166.496       0.391034 0.0964097   4.05597 4.99276e-05
## ENSMUSG00000042109  1100.254      -0.192379 0.0474723  -4.05244 5.06854e-05
## ENSMUSG00000013833   534.025      -0.231484 0.0571884  -4.04775 5.17120e-05
##                           padj
##                      <numeric>
## ENSMUSG00000018752 1.23092e-07
## ENSMUSG00000026034 2.96612e-05
## ENSMUSG00000038777 1.21129e-04
## ENSMUSG00000099696 2.37875e-03
## ENSMUSG00000042109 2.37875e-03
## ENSMUSG00000013833 2.37875e-03
par(mfrow=c(1,2))
plotCounts(Illumina_median.ds, gene="ENSMUSG00000018752", normalized = TRUE, xlab="")
plotCounts(Illumina_median.ds, gene = which.max(Illumina_median.results$padj), xlab="",
           main = "Gene with max. p.adj.\n(=least significant)")

# identify genes with the desired adjusted p-value cut-off
Illumina_median.DGEgenes <- rownames(subset(Illumina_median.results, padj < 0.05)) 
# extract rlog-transformed values into a matrix
Illumina_median_rlog.dge <- Illumina_median.rlog[Illumina_median.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(Illumina_median_rlog.dge, scale="row",
         show_rownames=FALSE, main="Illumina DGE",
         color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))

# identify genes with the desired adjusted p-value cut-off
Illumina_top.DGEgenes <- rownames(subset(Illumina_top.results, padj < 0.05)) 
# extract rlog-transformed values into a matrix
Illumina_top_rlog.dge <- Illumina_top.rlog[Illumina_top.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(Illumina_top_rlog.dge, scale="row",
         show_rownames=FALSE, main="Illumina DGE",
         color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))

# identify genes with the desired adjusted p-value cut-off
Illumina_bottom.DGEgenes <- rownames(subset(Illumina_bottom.results, padj < 0.05)) 
# extract rlog-transformed values into a matrix
Illumina_bottom_rlog.dge <- Illumina_bottom.rlog[Illumina_bottom.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
# pheatmap(Illumina_bottom_rlog.dge, scale="row",
#          show_rownames=FALSE, main="Illumina DGE",
#          color=colorRampPalette(RColorBrewer::brewer.pal(n=7, name="Reds"))(100))

changing the color scheme on pheatmap to see more contrast

# identify genes with the desired adjusted p-value cut-off
ONT_median.DGEgenes <- rownames(subset(ONT_median.results, pvalue<0.05)) 
# extract rlog-transformed values into a matrix
ONT_median_rlog.dge <- ONT_median.rlog[ONT_median.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_top.DGEgenes <- rownames(subset(ONT_top.results, pvalue<0.05)) 
# extract rlog-transformed values into a matrix
ONT_top_rlog.dge <- ONT_top.rlog[ONT_top.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_bottom.DGEgenes <- rownames(subset(ONT_bottom.results, pvalue<0.05)) 
# extract rlog-transformed values into a matrix
ONT_bottom_rlog.dge <- ONT_bottom.rlog[ONT_bottom.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

Since nanopore is not giving a great result with 500 genes. Let’s try increasing the gene set size.

median_1000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:1000],]

top_1000_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:1000],]

bottom_1000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:1000],]

sample_genes <- list(
  median500 = median_500_genes,
  top500 = top_500_genes,
  bottom500 = bottom_500_genes,
  median1000 = median_1000_genes,
  top1000 = top_1000_genes,
  bottom1000 = bottom_1000_genes
)

ONT_median1000.ds <- ONT_DESeq_overlap.ds[row.names(median_1000_genes), ]
ONT_top1000.ds <- ONT_DESeq_overlap.ds[row.names(top_1000_genes), ]
ONT_bottom1000.ds <- ONT_DESeq_overlap.ds[row.names(bottom_1000_genes), ]

ONT_median1000.rlog <- rlog(ONT_median1000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_median1000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_top1000.rlog <- rlog(ONT_top1000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_top1000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_bottom1000.rlog <- rlog(ONT_bottom1000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
plotPCA(ONT_bottom1000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

design(ONT_median1000.ds) <- ~ condition
ONT_median1000.ds$condition %<>% relevel(ref="GC")
ONT_median1000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median1000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median1000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_median1000.ds %<>% nbinomWaldTest()

ONT_median1000.results <- results(ONT_median1000.ds,independentFiltering = TRUE,alpha = 0.05)

design(ONT_top1000.ds) <- ~ condition
ONT_top1000.ds$condition %<>% relevel(ref="GC")
ONT_top1000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top1000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top1000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_top1000.ds %<>% nbinomWaldTest()

ONT_top1000.results <- results(ONT_top1000.ds,independentFiltering = TRUE,alpha = 0.05)


design(ONT_bottom1000.ds) <- ~ condition
ONT_bottom1000.ds$condition %<>% relevel(ref="GC")
ONT_bottom1000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom1000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom1000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_bottom1000.ds %<>% nbinomWaldTest()

ONT_bottom1000.results <- results(ONT_bottom1000.ds,independentFiltering = TRUE,alpha = 0.05)

# identify genes with the desired adjusted p-value cut-off
ONT_median1000.DGEgenes <- rownames(subset(ONT_median1000.results, pvalue<0.05)) 
# extract rlog-transformed values into a matrix
ONT_median1000_rlog.dge <- ONT_median1000.rlog[ONT_median1000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median1000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_top1000.DGEgenes <- rownames(subset(ONT_top1000.results, pvalue<0.05)) 
# extract rlog-transformed values into a matrix
ONT_top1000_rlog.dge <- ONT_top1000.rlog[ONT_top1000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top1000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_bottom1000.DGEgenes <- rownames(subset(ONT_bottom1000.results, pvalue<0.05)) 
# extract rlog-transformed values into a matrix
ONT_bottom1000_rlog.dge <- ONT_bottom1000.rlog[ONT_bottom1000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom1000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

median_5000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:5000],]

top_5000_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:5000],]

bottom_5000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:5000],]

sample_genes <- list(
  median500 = median_500_genes,
  top500 = top_500_genes,
  bottom500 = bottom_500_genes,
  median1000 = median_1000_genes,
  top1000 = top_1000_genes,
  bottom1000 = bottom_1000_genes,
  median5000 = median_5000_genes,
  top5000 = top_5000_genes,
  bottom5000 = bottom_5000_genes
)

ONT_median5000.ds <- ONT_DESeq_overlap.ds[row.names(median_5000_genes), ]
ONT_top5000.ds <- ONT_DESeq_overlap.ds[row.names(top_5000_genes), ]
ONT_bottom5000.ds <- ONT_DESeq_overlap.ds[row.names(bottom_5000_genes), ]

ONT_median5000.rlog <- rlog(ONT_median5000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_median5000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_top5000.rlog <- rlog(ONT_top5000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_top5000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_bottom5000.rlog <- rlog(ONT_bottom5000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
plotPCA(ONT_bottom5000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

design(ONT_median5000.ds) <- ~ condition
ONT_median5000.ds$condition %<>% relevel(ref="GC")
ONT_median5000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 5 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median5000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median5000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_median5000.ds %<>% nbinomWaldTest()

ONT_median5000.results <- results(ONT_median5000.ds,independentFiltering = TRUE,alpha = 0.05)

design(ONT_top5000.ds) <- ~ condition
ONT_top5000.ds$condition %<>% relevel(ref="GC")
ONT_top5000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top5000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top5000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_top5000.ds %<>% nbinomWaldTest()

ONT_top5000.results <- results(ONT_top5000.ds,independentFiltering = TRUE,alpha = 0.05)


design(ONT_bottom5000.ds) <- ~ condition
ONT_bottom5000.ds$condition %<>% relevel(ref="GC")
ONT_bottom5000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 9 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom5000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom5000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_bottom5000.ds %<>% nbinomWaldTest()

ONT_bottom5000.results <- results(ONT_bottom5000.ds,independentFiltering = TRUE,alpha = 0.05)

# identify genes with the desired adjusted p-value cut-off
ONT_median5000.DGEgenes <- rownames(subset(ONT_median5000.results, padj<0.05)) 
# extract rlog-transformed values into a matrix
ONT_median5000_rlog.dge <- ONT_median5000.rlog[ONT_median5000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median5000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_top5000.DGEgenes <- rownames(subset(ONT_top5000.results, padj<0.05)) 
# extract rlog-transformed values into a matrix
ONT_top5000_rlog.dge <- ONT_top5000.rlog[ONT_top5000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top5000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_bottom5000.DGEgenes <- rownames(subset(ONT_bottom5000.results, padj<0.05)) 
# extract rlog-transformed values into a matrix
ONT_bottom5000_rlog.dge <- ONT_bottom5000.rlog[ONT_bottom5000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom5000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

median_8000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$ab_diff)[1:8000],]

top_8000_genes <- gencode.vM33.All.GeneLengths.filtered[order(-gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:8000],]

bottom_8000_genes <- gencode.vM33.All.GeneLengths.filtered[order(gencode.vM33.All.GeneLengths.filtered$GeneLength)[1:8000],]

sample_genes <- list(
  median500 = median_500_genes,
  top500 = top_500_genes,
  bottom500 = bottom_500_genes,
  median1000 = median_1000_genes,
  top1000 = top_1000_genes,
  bottom1000 = bottom_1000_genes,
  median5000 = median_5000_genes,
  top5000 = top_5000_genes,
  bottom5000 = bottom_5000_genes,
  median8000 = median_8000_genes,
  top8000 = top_8000_genes,
  bottom8000 = bottom_8000_genes
)

ONT_median8000.ds <- ONT_DESeq_overlap.ds[row.names(median_8000_genes), ]
ONT_top8000.ds <- ONT_DESeq_overlap.ds[row.names(top_8000_genes), ]
ONT_bottom8000.ds <- ONT_DESeq_overlap.ds[row.names(bottom_8000_genes), ]

ONT_median8000.rlog <- rlog(ONT_median8000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_median8000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_top8000.rlog <- rlog(ONT_top8000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
plotPCA(ONT_top8000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

ONT_bottom8000.rlog <- rlog(ONT_bottom8000.ds,blind=TRUE)
## rlog() may take a few minutes with 30 or more samples,
## vst() is a much faster transformation
plotPCA(ONT_bottom8000.rlog, intgroup="condition", ntop = 100, returnData=FALSE)
## using ntop=100 top features by variance

design(ONT_median8000.ds) <- ~ condition
ONT_median8000.ds$condition %<>% relevel(ref="GC")
ONT_median8000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 6 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_median8000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_median8000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_median8000.ds %<>% nbinomWaldTest()

ONT_median8000.results <- results(ONT_median8000.ds,independentFiltering = TRUE,alpha = 0.05)

design(ONT_top8000.ds) <- ~ condition
ONT_top8000.ds$condition %<>% relevel(ref="GC")
ONT_top8000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 13 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_top8000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_top8000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## -- note: fitType='parametric', but the dispersion trend was not well captured by the
##    function: y = a/x + b, and a local regression fit was automatically substituted.
##    specify fitType='local' or 'mean' to avoid this message next time.
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_top8000.ds %<>% nbinomWaldTest()

ONT_top8000.results <- results(ONT_top8000.ds,independentFiltering = TRUE,alpha = 0.05)


design(ONT_bottom8000.ds) <- ~ condition
ONT_bottom8000.ds$condition %<>% relevel(ref="GC")
ONT_bottom8000.ds %<>% DESeq()
## using pre-existing size factors
## estimating dispersions
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## -- replacing outliers and refitting for 12 genes
## -- DESeq argument 'minReplicatesForReplace' = 7 
## -- original counts are preserved in counts(dds)
## estimating dispersions
## fitting model and testing
# normalize for diffs in sequencing depth and abundance per sample
ONT_bottom8000.ds %<>% estimateSizeFactors()
# gene-wise dispersion estimates across all samples
ONT_bottom8000.ds %<>% estimateDispersions()
## found already estimated dispersions, replacing these
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
# fit a neg. binomial GLM and compute Wald stat for each gene 
ONT_bottom8000.ds %<>% nbinomWaldTest()

ONT_bottom8000.results <- results(ONT_bottom8000.ds,independentFiltering = TRUE,alpha = 0.05)

# identify genes with the desired adjusted p-value cut-off
ONT_median8000.DGEgenes <- rownames(subset(ONT_median8000.results, padj<0.05)) 
# extract rlog-transformed values into a matrix
ONT_median8000_rlog.dge <- ONT_median8000.rlog[ONT_median8000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_median8000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_top8000.DGEgenes <- rownames(subset(ONT_top8000.results, padj<0.05)) 
# extract rlog-transformed values into a matrix
ONT_top8000_rlog.dge <- ONT_top8000.rlog[ONT_top8000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_top8000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

# identify genes with the desired adjusted p-value cut-off
ONT_bottom8000.DGEgenes <- rownames(subset(ONT_bottom8000.results, padj<0.05)) 
# extract rlog-transformed values into a matrix
ONT_bottom8000_rlog.dge <- ONT_bottom8000.rlog[ONT_bottom8000.DGEgenes,] %>% assay
# heatmap of DEG sorted by p.adjust
pheatmap(ONT_bottom8000_rlog.dge, scale="row",
         show_rownames=FALSE, main="ONT DGE",
         color = colorRampPalette(c("blue", "red"))(100))

Length Composition Analysis

gencode.vM33.ONT.GeneLengths <- gencode.vM33.All.GeneLengths[row.names(ONT_DESeq.ds),]
gencode.vM33.ONT.GeneLengths.overlap <- gencode.vM33.All.GeneLengths.filtered[row.names(ONT_DESeq_overlap.ds),]

hist(gencode.vM33.ONT.GeneLengths$log2.GeneLength, freq = FALSE)

ONT_medians <- apply(counts(ONT_DESeq.ds), 1, median)

gencode.vM33.ONT.GeneLengths$medianCounts <- ONT_medians[match(rownames(gencode.vM33.ONT.GeneLengths), names(ONT_medians))]
gencode.vM33.ONT.GeneLengths.overlap$medianCounts <- ONT_medians[match(rownames(gencode.vM33.ONT.GeneLengths.overlap), names(ONT_medians))]

#Scatter plot of log(medianCounts) and log(geneLengths)
plot(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.ONT.GeneLengths, 
     main = "Scatter Plot of log2(Median Counts) vs. log2(Gene Length)",
     xlab = "log2(Gene Length)", ylab = "log2(Median Counts)", col = "black", pch = 16)

gencode.vM33.Illumina.GeneLengths <- gencode.vM33.All.GeneLengths[row.names(Illumina_DESeq.ds),]
gencode.vM33.Illumina.GeneLengths.overlap <- gencode.vM33.All.GeneLengths.filtered[row.names(Illumina_DESeq_overlap.ds),]

hist(gencode.vM33.Illumina.GeneLengths$log2.GeneLength, freq = FALSE)

Illumina_medians <- apply(counts(Illumina_DESeq.ds), 1, median)

gencode.vM33.Illumina.GeneLengths$medianCounts <- Illumina_medians[match(rownames(gencode.vM33.Illumina.GeneLengths), names(Illumina_medians))]
gencode.vM33.Illumina.GeneLengths.overlap$medianCounts <- Illumina_medians[match(rownames(gencode.vM33.Illumina.GeneLengths.overlap), names(Illumina_medians))]

#Scatter plot of log(medianCounts) and log(geneLengths)
plot(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.Illumina.GeneLengths, 
     main = "Scatter Plot of log2(Median Counts) vs. log2(Gene Length)",
     xlab = "log2(Gene Length)", ylab = "log2(Median Counts)", col = "black", pch = 16)

ONT_hist1 <- hist(gencode.vM33.ONT.GeneLengths$log2.GeneLength, freq = FALSE)

Illumina_hist1 <- hist(gencode.vM33.Illumina.GeneLengths$log2.GeneLength, freq = FALSE)

# Set the alpha (transparency) value, e.g., 0.5 for 50% transparency
transparency <- 0.5

# Create translucent colors using the rgb function
red_translucent <- rgb(1, 0, 0, alpha = transparency)
blue_translucent <- rgb(0, 0, 1, alpha = transparency)

# Find the overall range of y values
y_max <- max(c(ONT_hist1$counts, Illumina_hist1$counts))

# Plot histograms with translucent colors and adjusted y-axis limits
plot(ONT_hist1, col = red_translucent, ylim = c(0, y_max))  # first histogram
plot(Illumina_hist1, col = blue_translucent, add = TRUE)     # second histogram

#increase transparency
transparency <- 0.3
# Create translucent colors using the rgb function
red_translucent <- rgb(1, 0, 0, alpha = transparency)
blue_translucent <- rgb(0, 0, 1, alpha = transparency)

# Scatter plot of log(medianCounts) and log(geneLengths) for ONT data
plot(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.ONT.GeneLengths.overlap,
     main = "Scatter Plot of log2(Median Counts) vs. log2(Gene Length)",
     xlab = "log2(Gene Length)", ylab = "log2(Median Counts)", col = red_translucent, pch = 1,cex=0.1)

# Add points for Illumina data on the same plot
points(log2(medianCounts) ~ log2.GeneLength, data = gencode.vM33.Illumina.GeneLengths.overlap,
       col = blue_translucent, pch = 2,cex=0.1)

Illumina overestimates/gives more read counts to longer genes.

###Fit distributions of the scatter plots.

#Combine the median gene counts from both platforms
gencode.vM33.Illumina.GeneLengths$log2.medianCounts <- log2(gencode.vM33.Illumina.GeneLengths$medianCounts + 1)
gencode.vM33.ONT.GeneLengths$log2.medianCounts <- log2(gencode.vM33.ONT.GeneLengths$medianCounts + 1)

gencode.vM33.Illumina.GeneLengths.overlap$log2.medianCounts <- log2(gencode.vM33.Illumina.GeneLengths.overlap$medianCounts + 1)
gencode.vM33.ONT.GeneLengths.overlap$log2.medianCounts <- log2(gencode.vM33.ONT.GeneLengths.overlap$medianCounts + 1)

gencode.vM33.Combined.GeneLengths <- cbind(gencode.vM33.Illumina.GeneLengths.overlap,gencode.vM33.ONT.GeneLengths.overlap[,5:6])

selected_columns1 <- 5:6
prefix1 <- "Illumina_" 

colnames(gencode.vM33.Combined.GeneLengths)[selected_columns1] <- paste0(prefix1,colnames(gencode.vM33.Illumina.GeneLengths.overlap)[selected_columns1])

selected_columns2 <- 7:8
prefix2 <- "ONT_"
colnames(gencode.vM33.Combined.GeneLengths)[selected_columns2] <- paste0(prefix2,colnames(gencode.vM33.ONT.GeneLengths.overlap)[selected_columns1])
hist(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts,freq = FALSE)
lines(density(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts))

hist(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts, freq = FALSE)
lines(density(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts))

par(mar = c(5, 5, 2, 2))
descdist(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts, discrete = FALSE)

## summary statistics
## ------
## min:  0   max:  19.29066 
## median:  5.658211 
## mean:  5.414033 
## estimated sd:  3.488715 
## estimated skewness:  -0.001768405 
## estimated kurtosis:  1.949425
descdist(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts, discrete = FALSE)

## summary statistics
## ------
## min:  0   max:  13.17344 
## median:  0.5849625 
## mean:  1.503343 
## estimated sd:  1.878414 
## estimated skewness:  1.169843 
## estimated kurtosis:  3.916078
cor(gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts, gencode.vM33.Combined.GeneLengths$log2.GeneLength, method='pearson')
## [1] 0.4780111
cor(gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts, gencode.vM33.Combined.GeneLengths$log2.GeneLength, method='pearson')
## [1] 0.2076635

Since the correlations are not linear for ONT, I will try to assess the correlation assuming the data is non-linearly correlated.

nlcor(gencode.vM33.Combined.GeneLengths$log2.GeneLength,gencode.vM33.Combined.GeneLengths$Illumina_log2.medianCounts,plt=T)

## $cor.estimate
## [1] 0.4780111
## 
## $adjusted.p.value
## [1] 0
## 
## $cor.plot

nlcor(gencode.vM33.Combined.GeneLengths$log2.GeneLength,gencode.vM33.Combined.GeneLengths$ONT_log2.medianCounts,plt=T)

## $cor.estimate
## [1] 0.2076635
## 
## $adjusted.p.value
## [1] 0
## 
## $cor.plot

There is negligible correlation between nanopore sequencing gene counts and gene length.